# 1) model the ratings
# 2) relationship between reviews and ratings
# 8) correlation between age and # restaurants/# reviews
# 9) correlation between Chinese population and # Chinese restaurants and # reviews on Chinese restaurants
library("scales")
library("e1071")
library("ggplot2")
# 1) modeling the ratings
# Yelp's business proposition is the word of mouth: user-generated reviews and ratings.
# How does rating work? You give a rating (1~5) when you review a restaurant on Yelp.
# Then Yelp presumably collects all the ratings for each restuarant and generate a 1-5 with
# 0.5 increments. We have no knowledge about how actually Yelp computed these
# displayed ratings, but can we model the ratings?
rm(list = ls())
Yelp <- read.csv("DATA_JLH_CL.csv")
# set up the variables
rating<-Yelp$rating
review<-Yelp$num_reviews
# A basic box plot
qplot(y=rating)+geom_boxplot(fill="yellow",alpha=0.4)+
ggtitle("Boxplot of sample restaurant ratings in Boston") +
ylab("Restaurant Ratings on Yelp (1-5 stars)" ) +
theme(plot.title = element_text(size=20, face="bold", vjust=2))

# there are no ratings less than 3 ?! Are ALL restaurants that good?
# look at a histogram of ratings - BONUS 6 (new plots)
hist.rating<-qplot(rating, geom = "blank") +
geom_histogram(aes(y = ..density..),breaks=seq(2.75,5.25,0.5),
, alpha = 0.4,colour="black",fill="white") +
ggtitle("Distribution of Restaurant Ratings on Yelp") +
xlab("Restaurant Ratings on Yelp") +
ylab("Density") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
hist.rating+ stat_function(fun=dnorm, args=list(mean=mean(rating), sd=sd(rating)),
colour="#0072B2") +
geom_vline(xintercept =quantile(rating,c(0.025,0.975)) , color="red", label="zero")

# The distribution of ratings is so centered on the 3.5 to 4.5 and looks a bit normal.
# Why so?
# We suspect it is the CLT at work:
# for large enough samples, the distribution of the sum or mean is approxiamtely normal.
# If we assume ratings are rounded averages of all the ratings people give a restuarant,
# then the more reviews/ratings (larger sample size),
# the more normal the displayed rating scores (sample means) will be.
# Although we don't know the exact algorithm for computing the ratings,
# we can verify our hypothesis by segmenting our sample by the number of reviews (n)
# We have three ways to measure how normal the ratings get when the sample size increases,
# or when we get rid of more smaller samples
# Approach 1: skewness and kurtosis study - BONUS 13
# We look at the skewness and kurtosis for different sample sizes
# define our own functions to automate the study - BONUS 11
# input: n - split sample into ratings with more than n and less than n reviews
# output: skewness and kurtosis for each segmented sample
CLT<-function(n) {
index<-which(review<n)
rating0<-rating[index]
rating1<-rating[-index]
clt<-c(skewness(rating0),skewness(rating1),kurtosis(rating0),kurtosis(rating1))
clt
}
summary(review) # the maximum review is 2132, the minimum 7 so we can choose n from the range
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.0 65.5 138.0 206.0 267.0 2130.0
level<-seq(8,2132,1) # specify the level (n) by which we split the sample
clt<-data.frame(matrix(,ncol=0,nrow=0))
for (n in level) {
clt<-rbind(clt,c(n,CLT(n)))
colnames(clt)<-c("n","skewness less than n","skewness more than n",
"kurtosis less than n","kurtosis more than n")
}
head(clt) # a table with skewness and kurtosis
## n skewness less than n skewness more than n kurtosis less than n
## 1 8 NaN -0.02780 NaN
## 2 9 NaN -0.02668 NaN
## 3 10 0.7500 -0.04087 -1.6875
## 4 11 1.0733 -0.03984 -0.9200
## 5 12 0.0000 -0.03740 0.0625
## 6 13 0.2044 -0.04968 0.6266
## kurtosis more than n
## 1 -0.012116
## 2 -0.006942
## 3 -0.023232
## 4 -0.018100
## 5 -0.010668
## 6 -0.013253
# Skewness study
#ggplot graph - BONUS 6
ggplot(clt, aes(x=level, y=clt[,2])) +
geom_point(shape=1,col=rgb(1/2,0,1/4,1/4)) +
geom_hline(yintercept = 0, color="yellow", label="zero") +
ggtitle("Skewness values for ratings with more than n reviews") +
xlab("n number of reviews") +
ylab("Skewness") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
## Warning: Removed 2 rows containing missing values (geom_point).

# this is very convincing, as n increases, the skewness approaches 0
ggplot(clt, aes(x=level, y=clt[,3])) +
geom_point(shape=1,col=rgb(1,0,1/2,1/4)) +
geom_hline(yintercept = 0, color="7", label="zero") +
ggtitle("Skewness values for ratings with less than n reviews") +
xlab("n number of reviews") +
ylab("Skewness") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
## Warning: Removed 415 rows containing missing values (geom_point).

# also approaches 0, as we get rid of the smaller samples
length(which(abs(clt[,2])<=0.5))/2132 # percentage of skewness within -0.5 and 0.5
## [1] 0.9934
length(which(abs(clt[,3])<=0.5))/2132 # percentage of skewness within -0.5 and 0.5
## [1] 0.8021
# Most of the skewness values are within -0.5 and 0.5, normality is quite a good fit.
#Kurtosis study
ggplot(clt, aes(x=level, y=clt[,4])) +
geom_point(shape=1,col=rgb(0,0,1/4,1/4)) +
geom_hline(yintercept = 0, color="black", label="zero") +
ggtitle("Excess kurtosis for ratings with less than n reviews") +
xlab("n number of reviews") +
ylab("Excess kurtosis") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
## Warning: Removed 2 rows containing missing values (geom_point).

# excess kurtosis approaches 0, as n increases; so normality is a good fit
# limitations of normality
ggplot(clt, aes(x=level, y=clt[,5])) +
geom_point(shape=1,col=rgb(1/4,0,1,1/2)) +
geom_hline(yintercept = 0, color="black", label="zero") +
ggtitle("Excess kurtosis for ratings with more than n reviews") +
xlab("n number of reviews") +
ylab("Excess kurtosis") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
## Warning: Removed 415 rows containing missing values (geom_point).

# as we get rid of the smaller samples, kurtosis is close to 0 when n< 500 but
# then strays away far from 0 when n > 500
length(rating[which(review>500)]) # the observations with more than 500 reviews
## [1] 79
length(rating[which(review>1000)]) # the observations with more than 1000 reviews
## [1] 15
# since the observations in our data get fewer and fewer until exhausted.
# 500 is the limit beyond which the data are too scarce to be showing normality
# Approach 2: Central Limit Theorem (CLT) approximation
# Since there are more ratings in the middle values, we wonder:
# What exactly is the probability that a rating score is 3.5 to 4.5?
# we define a function to automate our study
# input: n - divide the samples into ones with less than n reviews and ones with more than n reviews
# output: for each divided sample,
# the exact proportion of ratings from 3.5 to 4.5;
# CLT approximation with continuity correction;
# and error - how much in percentage CLT approximation misses the exact answer
CLTprob<-function(n) {
index<-which(review<n)
rating0<-rating[index]
rating1<-rating[-index]
exact0<-length(which((rating0<=4.5)&(rating0>=3.5)))/length(rating0)
prob0<-pnorm(4.75, mean(rating0),sd(rating0))-pnorm(3.25, mean(rating0),sd(rating0))
error0<-abs(prob0-exact0)/exact0
exact1<-length(which((rating1<=4.5)&(rating1>=3.5)))/length(rating1)
prob1<-pnorm(4.75, mean(rating1),sd(rating1))-pnorm(3.25, mean(rating1),sd(rating1))
error1<-abs(prob1-exact1)/exact1
cltprob<-cbind(c(exact0,exact1),c(prob0,prob1),c(error0,error1)*100)
rownames(cltprob)<-c("more than n","less than n")
colnames(cltprob)<-c("Exact probability","CLT approximation","Error in %")
cltprob
}
# the probability of ratings from 3.5 to 4.5 for different levels
CLTprob(8)[2,] # the original sample
## Exact probability CLT approximation Error in %
## 0.9719 0.9587 1.3667
CLTprob(30)
## Exact probability CLT approximation Error in %
## more than n 0.9412 0.9097 3.345
## less than n 0.9748 0.9631 1.206
CLTprob(70)
## Exact probability CLT approximation Error in %
## more than n 0.9660 0.9482 1.851
## less than n 0.9741 0.9625 1.192
CLTprob(100)
## Exact probability CLT approximation Error in %
## more than n 0.9740 0.9566 1.786
## less than n 0.9707 0.9601 1.089
CLTprob(200)
## Exact probability CLT approximation Error in %
## more than n 0.9653 0.9493 1.657
## less than n 0.9836 0.9734 1.032
CLTprob(500)
## Exact probability CLT approximation Error in %
## more than n 0.9707 0.9555 1.55768
## less than n 0.9873 0.9877 0.03835
CLTprob(1000)
## Exact probability CLT approximation Error in %
## more than n 0.9715 0.9582 1.377
## less than n 1.0000 0.9783 2.171
plot(1,1, type ="n", xlim = c(5,2100), ylim = c(0,10),
ylab="Errors in %", main="CLT Approximation Errors in %" ,
xlab="n, the number of reviews for the restaurants")
legend(1300,6.5, lty=c(1,1),c("Less than n","More than n"), lwd=c(2.5,2.5),col=c("black","red"))
for (i in 10:2000) {
points(i,CLTprob(i)[1,3])
points(i,CLTprob(i)[2,3], col ="red")
}

# We observe from these tables that as the sample sizes increases,
# the errors get smaller, CLT approximations get better;
# if we get rid of the small samples until n=500, the CLT approximations also get better
# again, above 500 reviews the data get so few and the errors stray away.
# Approach 3: Graphical approach - BONUS 6 and 12 (Watch the rainbow curves!)
# We will fit a normal curve to different sample sizes from 30 to 500 reviews
# define functions that generate a accustomed histogram and overlay a normal curve
CLT.hist<-function(n){
index<-which(review<n)
rating0<-rating[index]
hist.rating<-qplot(rating0, geom = "blank") +
geom_histogram(aes(y = ..density..),breaks=seq(2.75,5.25,0.5),
, alpha = 0.4,colour="black",fill="white") +
stat_function(fun=dnorm, args=list(mean=mean(rating0), sd=sd(rating0)),
color=sample(20,1)) +
ggtitle("Distribution of Restaurant Ratings on Yelp") +
xlab("Restaurant Ratings on Yelp") +
ylab("Density") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
hist.rating
}
# The normal curves are not a bad fit
CLT.hist(30)# ratings with less than 30 reviews

CLT.hist(70)# ratings with less than 70 reviews

CLT.hist(100)# ratings with less than 100 reviews

CLT.hist(200)# ratings with less than 200 reviews

CLT.hist(300) # ratings with less than 300 reviews

CLT.hist(500) # ratings with less than 500 reviews

CLT.hist(2132) # the entire sample

# Also note the shift of distribution from left-skewed to right-skewed:
# the "shoulders" of the histograms move from one way to the other, like breakdancing!
# If we overlay these normal curves, we discover something interesting going on:
hist.rating# set the historgram of the entire sample as the background

CLT.curve<-function(g,n,col){
index<-which(review<n)
rating0<-rating[index]
hist.rating<-g+ stat_function(fun=dnorm, args=list(mean=mean(rating0), sd=sd(rating0)),
color=col)
hist.rating
}
rainbow<-c("Red","Orange","Yellow",'Green','Blue','Violet','magenta')
g1<-CLT.curve(hist.rating,30,rainbow[1]);g1 # ratings with less than 30 reviews

g2<-CLT.curve(g1,70,rainbow[2]);g2 # ratings with less than 70 reviews

g3<-CLT.curve(g2,100,rainbow[3]);g3 # ratings with less than 100 reviews

g4<-CLT.curve(g3,200,rainbow[4]);g4 # ratings with less than 200 reviews

g5<-CLT.curve(g4,300,rainbow[5]);g5 # ratings with less than 300 reviews

g6<-CLT.curve(g5,500,rainbow[6]);g6 # ratings with less than 500 reviews

g<-CLT.curve(g5,2132,rainbow[7]);g # our entire sample

# As the sample size increases, the rainbow curves shift to the left!
# That means, as the number of reviews increase, the ratings tend to decrease!
# To dig into the truth, we will look at the correlation between review and ratings later.
# With the 3 approaches to model the ratings, we verified our CLT hypothesis:
# With enough reviews, a normal distribution could have generated the ratings in our sample!
# If Yelp ignored the users' ratings, could they have used this distribution for the ratings?
# We have found a relationship that might have not been statistically significant!
# BONUS 9
# REQUIREMENT 2 - Student t confidence interval:
# What is the average rating of Bostonian restaurants?
# Now that we assume each rating is a sample mean of the ratings for each restaurant,
# and show that they are quite normally distributed; with a large sample of these means,
# we naturally want a student t confidence interval for the whole population in Boston.
student<-(rating-mean(rating))/(sd(rating)/sqrt(length(rating)))
hist(student,breaks=seq(-150,150,20),freq=FALSE,main="A t distribution?",xlab="rescaled student variable")
curve(dt(x,n-1),col='red',add=TRUE) # does not fit at all

# We failed! Why?
# This is because our ratings are discrete values, and are not strictly normal variables,
# so we are not justified to use student t confidence interval
# but for the sake of comparison later, we can bend the theory:
classic.t<-t.test(rating,conf.level=.99);classic.t
##
## One Sample t-test
##
## data: rating
## t = 342.3, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
## 3.93 3.99
## sample estimates:
## mean of x
## 3.96
# a very small interval with so many degrees of freedom!
# we are 99% confident that the true mean ratings is practically 4. Quite high a number!
# with a large sample, the best is to use a boostrap t confidence interval
xbar <- mean(rating); xbar #sample mean
## [1] 3.96
S <- sd(rating); S #sample standard deviation
## [1] 0.3656
n <- length(rating);n
## [1] 999
SE <- S/(sqrt(n)) ; SE #sample standard error
## [1] 0.01157
#Check our methodology with a single bootstrap resample
x <-sample(rating, size = n, replace = TRUE) #resample
Tstar<-(mean(x) - xbar)/(sd(x)/sqrt(n)); Tstar #a t statistic
## [1] -0.3896
#Now we will estimate the distribution of the t statistic
N = 10^4; Tstar = numeric(N) #vector of t statistics
means = numeric(N); StdErrs = numeric(N)
for (i in 1:N) {
x <-sample(rating, size = n, replace = TRUE)
Tstar[i] <-(mean(x) - xbar)/(sd(x)/sqrt(n))
means[i] = mean(x); StdErrs[i] = sd(x)/sqrt(n)
}
#The bootstrap t statistic is approximately normal except for the tails
qqnorm(Tstar)
qqline(Tstar)

# BONUS 6 - new plots
qplot(Tstar, geom = "blank") +
geom_histogram(aes(y = ..density..)
, alpha = 0.4,colour="black",fill="white") +
stat_function(fun=dt, args=n-1,
colour="blue") +
ggtitle("Boostrap Distribution of Ratings") +
xlab("Boostrap statistics") +
ylab("Density") +
theme(plot.title = element_text(size=20, face="bold", vjust=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

#A Student t curve is quite good a match except for the center
#To get a confidence interval, use the bootstrap quantiles along with the sample mean and standard deviation
q<-quantile(Tstar, c(.005, .995),names=FALSE)
L <- xbar - q[2]*SE; U <- xbar - q[1]*SE; L; U
## [1] 3.931
## [1] 3.989
#Here, for comparison, is the bootstrap percentile and the classical t confidence interval
quantile(means, c(.005, .995),names=FALSE)
## [1] 3.931 3.989
classic.t$conf.int
## [1] 3.93 3.99
## attr(,"conf.level")
## [1] 0.99
# we can display these 3 intervals
hist(rating, breaks=seq(2.75,5.25,0.5), xlim=c(2.5,5.5),freq=FALSE,
main="Distribution of sample restaurant ratings in Boston") # the whole sample
abline(v=c(L,U),col='black',lty=3)
abline(v=quantile(means, c(.005, .995),names=FALSE),col='red',lty=4)
abline(v=classic.t$conf.int,col='blue',lty=5)

# these 3 intervals are so close to each other that we may practically use any one
# but theoretically, we are only allowed to use the boostrap t statistics.
# so we can claim with 99% confidence that the true mean rating of Bostonian restaurants is 4!
# Good job, Boston!
# Well, now we also have a sense of how inflated the ratings can be.
# BONUS 14 - Chi-square distribution
# As I recall from W7 section problem 2, we rescaled a discrete random variable X~binom
# and produced chi-square statistics out of it with the student t approach;
# Knowing that our ratings are roughly normal but discrete,
# are we able to theoretically replicate Chi-square distributions?
mu<-mean(rating);mu # unbiased estimator of the mean of the population
## [1] 3.96
sigma<-sqrt((n-1)/n*var(rating)); sigma # unbiased estimator of signma of the population
## [1] 0.3654
x<-(sample(rating, 10,replace=T)-mu)/sigma; x # we make it a standard normal variable
## [1] 0.1096 0.1096 -1.2587 0.1096 0.1096 -1.2587 0.1096 0.1096
## [9] 1.4779 -1.2587
# let's simulate 10000 times
# large sample is a data luxury: we can choose a small sample this time
df<-50
N<-10000; means <-numeric(N); vars<-numeric(N); sumsq <- numeric(N)
for (i in 1:N) {
x<-(sample(rating,df,replace=FALSE)-mu)/sigma
means[i] <- mean(x)
vars[i]<- var(x)
sumsq[i]<-sum(x^2)
}
hist(means^2*df,freq=FALSE,main="Chi-square Distribution") # this quantity is called W in proof 1 of week 7
curve(dchisq(x,1),col='green',add=TRUE) # Big surpise! it fits well!

hist(sumsq,freq=FALSE,main="Chi-square Distribution") # this quantity is called U in proof 1 of week 7
curve(dchisq(x,df),col='red',add=TRUE) # Wow, it fits nicely

hist(vars*(df-1),freq=FALSE,main="Chi-square Distribution") # this quantity is called V in proof 1 of week 7
curve(dchisq(x,df-1),col='violet',add=TRUE) # again, fits well

cor(means^2,vars) # highly uncorrelated
## [1] 0.01869
hist(means/sqrt(vars/df),freq=FALSE,main="Student t Distribution") # the quantity T=(X.bar-mu)/(S/sqrt(n))
# we painstakingly proved in class!
curve(dt(x,df-1),col="green",add=TRUE) # the fit is ok

# This surprising connection with Chi-square distributions might qualify for BONUS 9
# 2) BONUS 16 and REQUIREMENT 1: correlation between reviews and ratings
# As we observe earlier, the normal curves shift towards the left as reviews increase.
# Do more reviews tend to generate lower ratings?
plot(review,rating,xlab="Number of Reviews",ylab='Ratings',main="Scatter plot of ratings against reviews")
cor(review,rating) # they seem to be negatively correlated
## [1] -0.1098
ReviewRating.lm <- lm(rating~num_reviews, data =Yelp);ReviewRating.lm
##
## Call:
## lm(formula = rating ~ num_reviews, data = Yelp)
##
## Coefficients:
## (Intercept) num_reviews
## 3.997697 -0.000183
intercept<-coef(ReviewRating.lm)[1]
slope<-coef(ReviewRating.lm)[2]
abline(intercept,slope,col='blue') # downward slope
# Fit a non-linear line
lines(smooth.spline(rating~review), col = "red")
legend(1000,5, lty=c(1,1),c("linear regression","non-linear"), lwd=c(2.5,2.5),col=c("blue","red"))
PredictRating <- intercept + slope * review
points(review,PredictRating, col = "green",add = TRUE, pch = 20) #these lie on the regression line
## Warning: "add" is not a graphical parameter

ResidRating <- rating - PredictRating
plot(review,ResidRating,main="Residual plot of ratings against reviews") # this is clearly not random
abline(h=0, col = "red")

summary(ReviewRating.lm)$r.squared # BONUS 13 - R^2
## [1] 0.01206
# only 1% data is explained by our model;our model clearly does not work
#Now do 5000 permutations to see whether the actual beta could arise by chance.
N <- 4999; n <- nrow(Yelp)
cor.perm <- numeric(N); beta.perm <- numeric(N)
for (i in 1:N) {
index <- sample(n, replace = FALSE)
rating.perm <- rating[index]
beta.perm[i] <-coef(lm(rating.perm~ Yelp$num_reviews))[2] #beta
cor.perm[i] <-cor(Yelp$num_reviews, rating.perm) #correlation
}
hist(beta.perm, xlim = c(-0.0005,0.0005))
abline(v = slope, col = "red")

# although off the chart, our beta is so close to zero that our linear model fails
hist(cor.perm, xlim = c(-0.2,0.2))
abline(v = cor(review,rating), col = "red") # off the charts

# although significant, the correlation between ratings and reviews is quite weak!
# 3) Correlation between age and activitiy of reviews
# With census data about the population relating to the Yelp restaurant data we have,
# we wonder about the correlation between age and the number of reviews:
# Does the population of 20-to-40-year-old correlate with
# the number of restaurants reviewed on Yelp?
# Create by zip code data frame of # restaurants, # of people
byZip <- data.frame(table(Yelp$zip));head(byZip)
## Var1 Freq
## 1 2108 28
## 2 2109 21
## 3 2110 21
## 4 2111 46
## 5 2113 34
## 6 2114 21
add_column <- numeric(nrow(byZip))
byZip <- data.frame(byZip, add_column);head(byZip)
## Var1 Freq add_column
## 1 2108 28 0
## 2 2109 21 0
## 3 2110 21 0
## 4 2111 46 0
## 5 2113 34 0
## 6 2114 21 0
colnames(byZip) <- c("zip", "num_rest", "age")
for(i in 1:length(byZip$zip))
{
# each zip code has the same population, whichever row that zip code appears in
# so just take the value in the first record returned by this which
byZip$age[i] = Yelp$perct_20.40[min(which(Yelp$zip == byZip$zip[i]))]
}
# make a scatter plot
plot(byZip$age, byZip$num_rest,col=rgb(1/2,0,1/2,1),xlab="% population of 20-to-40-year-old by zipcode",
ylab="Number of restaurants by zipcode",main="Correlation between 20s-to-40s and # of restaurants")
# looks roughly like positive correlation
# check correlation of these two independent variables - BONUS POINT #16
our_corr <- cor(byZip$age, byZip$num_rest); our_corr
## [1] 0.5564
# .56 indicates positive correlation
# find a regression line
rest_by_age <- lm(byZip$num_rest~byZip$age, data=byZip); rest_by_age
##
## Call:
## lm(formula = byZip$num_rest ~ byZip$age, data = byZip)
##
## Coefficients:
## (Intercept) byZip$age
## -10.96 0.78
intercept<-coef(rest_by_age)[1]
slope<-coef(rest_by_age)[2]
# this tells us that, for every additional percentage of 20-to-40-year-old living within a zipcode,
# we will have about roughly 1 more restaurant for that zipcode.
abline(rest_by_age, col="blue")
legend(20,57, lty=c(1,1),c("linear regression line","non-linear line"), lwd=c(2.5,2.5),col=c("blue","red"))
# let's just check that something nonlinear isn't going on:
lines(smooth.spline(byZip$num_rest~byZip$age), col="blue")

# wow, it overlapps nicely with our linear model!
summary(rest_by_age)$r.squared # R^2 is 30%
## [1] 0.3096
# 30% of the data is explained by our linear model
# To use the t distribution, let's first check if we have independent, normally distributed residuals
predictions <- intercept + slope * byZip$age
residuals <- byZip$num_rest - predictions
plot(byZip$age, residuals)
abline(h=0, col="red")

# it looks pretty randomly distributed except a few outliers in the 40-50% range
# we can also bootstrap to create confidence interval. (for BONUS 8)
N <- 10^4
cor.boot <- numeric(N)
slope.boot <- numeric(N)
n <- nrow(byZip)
get_intercept <- function(X, Y)
{
mean(Y) - slope*mean(X)
}
get_slope <- function(X, Y)
{
sum(( X - mean(X)) * ( Y - mean(Y)) / sum((X - mean(X))^2))
}
for(i in 1:N)
{
index <- sample(1:n, n, replace = TRUE)
data.boot <- byZip[index, ]
cor.boot[i] <- cor(data.boot$num_rest, data.boot$age)
slope.boot[i] <- get_slope(data.boot$age, data.boot$num_rest)
}
# REQUIREMENT 3
# Display a 95% Confidence Interval for the correlation
# between the population % and number of restaurants
hist(cor.boot,freq=FALSE,main="Boostrap distribution of correlation coefficients")
abline(v=quantile(cor.boot, c(.025, .975)), col="blue")
abline(v=our_corr, col="red")

quantile(cor.boot, c(.025, .975))
## 2.5% 97.5%
## 0.3854 0.7110
# Find a 95% Confidence Interval for the slope of our model
hist(slope.boot,main="Boostrap distribution of slope of our linear model")
abline(v=quantile(slope.boot, c(.025, .975)), col="blue")
abline(v=slope, col="red")

quantile(slope.boot, c(.025, .975))
## 2.5% 97.5%
## 0.5119 1.0801
# 4) correlation between Asian population and # Chinese restaurants
# I love Chinese cuisine but I struggled to locate a Chinese restaurant nearby,
# so a question arises naturally: Am I living not close enough to where my food lives?
# Statistically, this question translates into the following:
# Is there any correlation between Asian population and # Chinese restaurants
asian<-Yelp$perct_Asian
category<-Yelp$category
index<-which(Yelp$category=="chinese")
chinese<-Yelp[index,] # select all the Chinese restaurants
# Create by zip code data frame of # restaurants, % of Asian
byZip <- data.frame(table(chinese$zip));head(byZip)
## Var1 Freq
## 1 2108 1
## 2 2110 1
## 3 2111 20
## 4 2118 1
## 5 2134 4
## 6 2135 2
add_column <- numeric(nrow(byZip))
byZip <- data.frame(byZip, add_column);head(byZip)
## Var1 Freq add_column
## 1 2108 1 0
## 2 2110 1 0
## 3 2111 20 0
## 4 2118 1 0
## 5 2134 4 0
## 6 2135 2 0
colnames(byZip) <- c("zip", "num_rest", "asian")
for (i in 1:length(byZip$zip))
{
# each zip code has the same Asian population, whichever row that zip code appears in
# so just take the value in the first record returned by this which
byZip$asian[i] <- Yelp$perct_Asian[min(which(Yelp$zip == byZip$zip[i]))]
}
# make a scatter plot
plot(byZip$num_rest,byZip$asian,main="Correlation: Asian population vs Chinese restaurants",
ylab="Asian population % by zipcode",xlab="# of Chinese restuarants by zipcode") #
cor(byZip$num_rest,byZip$asian) # very strong postive correlaton!
## [1] 0.8461
# find a regression line
rest_by_asian <-lm(byZip$asian~byZip$num_rest); rest_by_asian
##
## Call:
## lm(formula = byZip$asian ~ byZip$num_rest)
##
## Coefficients:
## (Intercept) byZip$num_rest
## 9.34 1.57
intercept<-coef(rest_by_asian)[1]
slope<-coef(rest_by_asian)[2]
# this tells us that, for every additional percentage of asian living in a zipcode,
# we will have about roughly 1.5 more Chinese restaurant for that zipcode.
abline(rest_by_asian, col="blue")
legend(10,17, lty=c(1,1),c("linear regression line","non-linear line"), lwd=c(2.5,2.5),col=c("blue","red"))
# let's just check that something nonlinear isn't going on:
lines(smooth.spline(byZip$asian~byZip$num_rest), col="red")

# Wow, it overlapps perfectly with our linear model!
summary(rest_by_asian) # R^2 is 72%! 72% of the data is explained by our model
##
## Call:
## lm(formula = byZip$asian ~ byZip$num_rest)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.235 -2.675 -0.324 1.900 9.806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.337 1.500 6.23 3.1e-05 ***
## byZip$num_rest 1.569 0.274 5.72 7.0e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.95 on 13 degrees of freedom
## Multiple R-squared: 0.716, Adjusted R-squared: 0.694
## F-statistic: 32.8 on 1 and 13 DF, p-value: 7.02e-05
# this finding fits with our intuition, and clearly I have not been living in Asian community
# 5)
# Logistic regression - BONUS 4
# Local businesses can choose to "claim" their business on Yelp, maintaining their Yelp
# profiles and hoping to increase their popularity. But we challenge this assumption:
# Does claiming your business on Yelp correlate with publicity?
Yelp$claimed<-as.logical(toupper(Yelp$claimed))
claimed<-Yelp$claimed
fit <- glm(claimed ~ num_reviews, data = Yelp, family = binomial)
coef(fit) # the coefficients as a vector
## (Intercept) num_reviews
## 0.713716 0.001052
# Calculation: my logistic equation is ln(p/1-p)=0.713716164+0.001051541*x
# we can plot the probability of whether a business is claimed against number of reviews
x <- seq(7, 2132, length=500) # vector spanning the #review range
# compute predicted probabilities
y1 <- exp(coef(fit)[1]+coef(fit)[2]*x) / (1+exp(coef(fit)[1]+coef(fit)[2]*x))
y2 <- plogis(coef(fit)[1] + coef(fit)[2] * x)
plot(Yelp$num_reviews,Yelp$claimed, xlab='number of reviews',main="Correlation: claiming your business vs publicity",
ylab = "Probability of claimed your business")
lines(x, y2,col='blue') # our logistic regression curve
legend(1000,0.5, lty=1,"Built-in glm function", lwd=2.5,col="blue")
# BONUS 15 - Maximum likelihood estimation
# MLE also gives us the parameters
library(stats4)
x<-Yelp$num_reviews;y<-Yelp$claimed
MLL<- function(alpha, beta) -sum( log( exp(alpha+beta*x)/(1+exp(alpha+beta*x)) )*y+
log(1/(1+exp(alpha+beta*x)))*(1-y) )
results<-mle(MLL,start = list(alpha = -0.1, beta = -0.02))
results@coef
## alpha beta
## 0.693355 0.001159
curve( exp(results@coef[1]+results@coef[2]*x)/ (1+exp(results@coef[1]+results@coef[2]*x)),col = "red", add=TRUE)
legend(1000,0.5, lty=c(1,1),c("Built-in glm function","MLE estimation"), lwd=c(2.5,2.5),col=c("blue","red"))

# almost overlays the curve as we got from the glm function
# BONUS 8 + Technical Display 3: We can make boostrap confidence intervals too
N <- 10^3
n <- length(claimed) # number of observations
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
for (i in 1:N)
{
index <- sample(n, replace = TRUE)
claimed.boot <- Yelp[index, ] # resampled data
fit.boot <- glm(claimed ~ num_reviews, data = claimed.boot,
family = binomial)
alpha.boot[i] <- coef(fit.boot)[1] # new intercept
beta.boot[i] <- coef(fit.boot)[2] # new slope
}
quantile(beta.boot, c(.025, .975)) # 95% percentile intervals for the slope
## 2.5% 97.5%
## 0.000187 0.002259
#Now we can look at the distribution of the resampled results
hist(beta.boot,,main="Boostrap distribution of beta",freq=FALSE)
abline(v=quantile(beta.boot, c(.025, .975)),col="green") #95% confidence interval for beta
abline(v=coef(fit)[2],col="blue") # the observation

quantile( beta.boot, c(.025, .975)) # the bootstrap confidence interval for beta
## 2.5% 97.5%
## 0.000187 0.002259
hist(alpha.boot,freq=FALSE,main="Boostrap distribution of alpha")
abline(v=quantile( alpha.boot, c(.025, .975)),col="red") #95% confidence interval for alpha
abline(v=coef(fit)[1],col="blue") # the observation

quantile( alpha.boot, c(.025, .975)) # the bootstrap confidence interval for alpha
## 2.5% 97.5%
## 0.4799 0.9241